In [1]:
import pandas as pd
import re
import numpy as np
import warnings
import datetime
Add fixturing method using graph processing
Add option to use one of the following fixturing methods:
Graph processing
Method based on elo relative to mean - statistical method?
Add caching
Needs to:
Check for the existence of a cache
If cache doesn't exist, create it
If cache does exist:
Check if the result is in the cache
If the result is in the cache, retrieve the result and don't process the game "normally"
Caching might have issues with teams playing games out of order?
Might need to write team elos and fixturing outcomes to the results sheet at the time of fixturing?
In [2]:
#################################### OPTIONS CHUNK ####################################
#################################### OPTIONS CHUNK ####################################
#################################### OPTIONS CHUNK ####################################
#################################### OPTIONS CHUNK ####################################
#################################### OPTIONS CHUNK ####################################
# Usecache - process results and cache them
# Caching needs to be properly implemented
usecache = False
# Process results - probably don't touch this?
processresults = True
# Warnings from pandas keep appearing
# Warning me that value is trying to be set on a copy of a slice from a DataFrame
# Not sure why the warnings are being passed
suppress_warnings = True
# Dead teams - the teams that do not exist any more - add them to the list
deadteams = ['The Swanbourne Supremacy','']
# Fixture teams - only mark false if you don't want to fixture the teams
fixtureteams = True
# Lots of text output if this is True - mainly for monitoring and testing
# Reporttime for timing - you can say verbose = False and reporttime = True if you only want to see the
# runtime of the script
verbose = True
reporttime = True
In [3]:
# Misc pre-work
if suppress_warnings:
warnings.filterwarnings('ignore')
# Timing stuff
starttime = datetime.datetime.now()
resultsfinish = False
fixturingfinish = False
In [4]:
mens_mixed_url = 'https://docs.google.com/spreadsheets/d/15TNjIJDkopneZ6PWg4IGhTQNvTl-T5U8u_i9zgJPEXA/export?format=csv&id=15TNjIJDkopneZ6PWg4IGhTQNvTl-T5U8u_i9zgJPEXA&gid=0'
elos_workbook = 'https://raw.githubusercontent.com/robfox92/HockeyElo/master/Elos_2016b_week_1.csv'
requests_sheet_url = 'https://docs.google.com/spreadsheets/d/15TNjIJDkopneZ6PWg4IGhTQNvTl-T5U8u_i9zgJPEXA/export?format=csv&id=15TNjIJDkopneZ6PWg4IGhTQNvTl-T5U8u_i9zgJPEXA&gid=2236421'
if usecache:
cached_games=pd.read_csv('processed results.csv')
mens_mixed_raw = pd.read_csv(mens_mixed_url)
elos_start_raw = pd.read_csv(elos_workbook)
games_requested = pd.read_csv(requests_sheet_url)
In [5]:
def removepunct(str_in):
for n in [".",",","!","'",'"','\n','-']:
str_in = str_in.replace(n,'')
str_out = str_in.lower()
return unicode(str_out)
def parse_round(x):
y = None
parsed = None
if type(x) == str:
parsed = re.search(r'([0-9]+)+',x).group(1)
y = int(parsed)
if (type(x) == int) or (type(x) == float):
y = int(x)
return y
def getKfactor(x):
newteam = 'New Team'
newteamK = 75
oldteamK = 50
if x == newteam:
out = newteamK
else:
out = oldteamK
return out
In [6]:
# Create game codes
mens_mixed_raw['Game Code'] = mens_mixed_raw['HOME'] + " vs " + mens_mixed_raw['AWAY']
playedgames = mens_mixed_raw['Game Code'].unique()
hometeams = mens_mixed_raw['HOME'].apply(removepunct).unique()
In [7]:
# Only get results that are validated
results = mens_mixed_raw[mens_mixed_raw['Validate'] == 'Y']
results['Round_number'] = results['ROUND'].apply(parse_round)
results['HOME'] = results['HOME'].apply(removepunct)
results['AWAY'] = results['AWAY'].apply(removepunct)
results['Game Code'] = results['Game Code'].apply(removepunct)
results.reset_index(inplace=True)
In [8]:
team_elos = elos_start_raw
team_elos['2016b Teams lower'] = elos_start_raw['2016b Teams'].apply(removepunct)
elos_dict = dict(zip(team_elos['2016b Teams lower'],team_elos['Starting Elo']))
team_elos['K Factor'] = team_elos['New Team'].apply(getKfactor)
team_K_factors = dict(zip(team_elos['2016b Teams lower'],team_elos['K Factor']))
In [9]:
results['Home Elo'] = None
results['Away Elo'] = None
elos_dict = dict(zip(team_elos['2016b Teams lower'],team_elos['Starting Elo']))
In [10]:
if usecache:
processedgames = cached_games['Game Code'].unique()
In [11]:
if verbose:
print "Beginning processing of results"
if usecache:
print "Using cache to avoid repeating result processing"
if not usecache:
print "Not using cache - this may slow down the processing"
#%%timeit -n 1 -r 1
team_elos_df = pd.DataFrame.from_dict(elos_dict,orient='index')
if processresults:
for row in range(0,len(results)):
if row%50 == 0 and row != 0 and verbose: print row,'results processed'
# Get the home and away teams
hometeam = results['HOME'][row]
awayteam = results['AWAY'][row]
# Check if the game has been processed previously
if usecache:
gamecode = results.loc[row,'Game Code']
if gamecode in processedgames:
# Grab the row from the cache
ref = cached_games[cached_games['Game Code'] == gamecode].index.tolist()
results.loc[row,] = cached_games.loc[ref,]
else:
# Get the elos from the elo dictionary
awayElo = elos_dict[awayteam]
homeElo = elos_dict[hometeam]
# Write to the df
results.loc[row,'Home Elo'] = homeElo
results.loc[row,'Away Elo'] = awayElo
roundnumber = results.loc[row,'Round_number']
homeK = 50
awayK = 50
if type(roundnumber) != str and roundnumber < 4:
# Get the team K factors
homeK = team_K_factors[hometeam]
awayK = team_K_factors[awayteam]
# Calculate the Score Expectancies
homeSE = 1 / (1 + 10 ** -((homeElo - awayElo) / 400))
awaySE = 1 / (1 + 10 ** ((homeElo - awayElo) / 400))
# Write to DF
results.loc[row,'Home Predicted Result'] = homeSE
results.loc[row,'Away Predicted Result'] = awaySE
# Get home, away and total scores
homescore = results['SCORE'][row]
awayscore = results['SCORE.1'][row]
totalscore = homescore+awayscore
# Calculate home and away score percentages
homeScorePerc = homescore / totalscore
awayScorePerc = awayscore / totalscore
# Write to DF
results.loc[row,'Home Actual Result'] = homeScorePerc
results.loc[row,'Away Actual Result'] = awayScorePerc
# Find Elo Changes
homeNewElo = homeElo + homeK * (homeScorePerc - homeSE)
awayNewElo = awayElo + awayK * (awayScorePerc - awaySE)
# Check to ensure winning teams don't lose Elo
if homescore > awayscore:
homeNewElo = max(homeNewElo,homeElo)
if awayscore > homescore:
awayNewElo = max(awayNewElo,awayElo)
newElos = {hometeam:homeNewElo, awayteam:awayNewElo}
elos_dict.update(newElos)
resultsfinish = datetime.datetime.now()
if verbose:print '\n',len(results),'total games processed\n\nProcessing complete'
In [12]:
elos_df = pd.DataFrame(elos_dict.items())
elos_df.columns = ['Team','Elo']
mean_elo = np.mean(elos_df['Elo'])
elos_df['Relative Elo'] = abs(elos_df['Elo']-mean_elo)
elos_df.sort_values(by='Relative Elo',ascending=False,inplace=True)
elos_df.reset_index(inplace=True)
del elos_df['index']
elos_df['Relative to Mean'] = (elos_df['Elo']-mean_elo)/elos_df['Relative Elo']
games_df = pd.DataFrame()
games_df['Team A'] = None
games_df['Team B'] = None
games_df.head();
In [13]:
allteams = elos_df['Team'].unique()
deadteams = set(deadteams)
for team in deadteams:
oldname = team
newname = removepunct(team)
deadteams = deadteams - {oldname}
deadteams = deadteams | {newname}
currentteams = set(allteams) - deadteams
In [14]:
elos_df_allteams = elos_df
elos_df = elos_df.ix[elos_df['Team'].isin(currentteams)]
elos_df.reset_index(inplace=True)
del elos_df['index']
In [15]:
if fixtureteams:
if verbose:print "Beginning fixturing of teams"
# %%timeit -n 1 -r 1
byeteam = ''
if len(elos_df)%2 == 1:
byeteam = removepunct(raw_input('Enter the Bye Team:\n'))
games_df = pd.DataFrame()
games_df['Team A'] = None
games_df['Team B'] = None
# Process the requested games here
games_requested['Team 1'] = games_requested['Team 1'].apply(removepunct)
games_requested['Team 2'] = games_requested['Team 2'].apply(removepunct)
games_requested['Game Code 1'] = games_requested['Team 1'] + " vs " + games_requested['Team 2']
games_requested['Game Code 2'] = games_requested['Team 2'] + " vs " + games_requested['Team 1']
for row in range(0,len(games_requested)):
team1 = games_requested.loc[row,'Team 1']
team2 = games_requested.loc[row,'Team 2']
code1 = games_requested.loc[row,'Game Code 1']
code2 = games_requested.loc[row,'Game Code 2']
# Check if the teams have been fixtured this round
# Check if the teams have played each other yet
# Check if either team has been fixtured to have the bye this week
if (
team1 not in games_df['Team A'].unique() and team1 not in games_df['Team B'].unique() and
team2 not in games_df['Team A'].unique() and team2 not in games_df['Team B'].unique() and
code1 not in playedgames and
code2 not in playedgames and
team1 != byeteam and
team2 != byeteam
):
relevantrow = len(games_df)
games_df.loc[relevantrow,'Team A'] = team1
games_df.loc[relevantrow,'Team B'] = team2
games_df.loc[relevantrow,'A Elo'] = elos_dict[team1]
games_df.loc[relevantrow,'B Elo'] = elos_dict[team2]
# Fixture the rest of the games based on relative elo
# Teams further from the mean elo are fixtured before teams closer to the mean elo
for row in range(0,len(elos_df)):
# Grab the team name, check to see if they've been fixtured yet
team = elos_df.loc[row,'Team']
team_fixtured = (team in games_df['Team A'].unique()) or (team in games_df['Team B'].unique()) or (team == byeteam)
# Grab the elo and their position relative to the mean
team_elo = elos_df.loc[row,'Elo']
rel = elos_df.loc[row,'Relative to Mean']
if rel == 1:
team_below_mean = False
if rel == -1:
team_below_mean = True
#else:
#print "Error in calculating team's elo relative to the mean\nPlease check the team entry in elos_df"
#team_below_mean = None
# If they haven't, fixture them to the nearest possible game
if not team_fixtured:
elos_df_by_elo = elos_df.sort_values(by='Elo',ascending = team_below_mean)
game_fixtured = False
roww = 0
while (not game_fixtured and roww < len(elos_df)-1):
# Grab a potential team
possibleteam = elos_df_by_elo.loc[roww,'Team']
# Check to see if they are the same, and that possible team has not been fixtured already
if (possibleteam!=team and
(possibleteam not in games_df['Team A'].unique()) and
(possibleteam not in games_df['Team B'].unique()) and
(possibleteam != byeteam) and
(abs(elos_dict[team]-elos_dict[possibleteam])<100)
):
# Generate two game codes
gamecode1 = possibleteam + " vs " + team
gamecode2 = team + " vs " + possibleteam
if gamecode1 not in playedgames and gamecode2 not in playedgames:
relevantrow = len(games_df)
games_df.loc[relevantrow,'Team A'] = team
games_df.loc[relevantrow,'Team B'] = possibleteam
games_df.loc[relevantrow,'A Elo'] = elos_dict[team]
games_df.loc[relevantrow,'B Elo'] = elos_dict[possibleteam]
game_fixtured = True
else: roww = roww+1
else: roww = roww+1
for team in currentteams:
teamInA = team in games_df['Team A'].unique()
teamInB = team in games_df['Team B'].unique()
teamfixtured = teamInA or teamInB
if not teamfixtured and verbose:
if team != byeteam:
print team,'has not been fixtured'
elif team == byeteam:
print team,'was the bye team this round and has not been fixtured'
games_df.reset_index(inplace = True)
del games_df['index']
if verbose:print "\n\nFixturing complete"
In [16]:
if fixtureteams:
games_df['Elo Diff'] = games_df['A Elo'] - games_df['B Elo']
In [17]:
if fixtureteams:
# %%timeit -n 1 -r 1
# Work out who is playing at home
for row in range(0,len(games_df)):
teamA = games_df.loc[row,'Team A']
teamB = games_df.loc[row,'Team B']
teamAhome = len(hometeams[hometeams == teamA])
teamBhome = len(hometeams[hometeams == teamB])
if teamAhome > teamBhome:
games_df.loc[row,'Home'] = teamB
games_df.loc[row,'Away'] = teamA
else:
games_df.loc[row,'Home'] = teamA
games_df.loc[row,'Away'] = teamB
games_df.loc[row,'Game Code'] = games_df.loc[row,'Home'] + " vs " + games_df.loc[row,'Away']
#print games_df.loc[row,'Game Code']
fixturingfinish = datetime.datetime.now()
In [18]:
if verbose or reporttime:
if resultsfinish:
resultstime = resultsfinish - starttime
print "It took",resultstime,'to process the results'
if fixturingfinish and resultsfinish:
fixturetime = fixturingfinish - resultsfinish
print "It took",fixturetime,'to fixture the teams'
if fixturingfinish:
totaltime = fixturingfinish - starttime
print "It took",totaltime,'in total to run this script'
In [ ]:
In [ ]: